V20 - Using SJC disease designations for cohort assignment
V19 - updated to survey_data.2020-07-08_17.17.10.txt, then survey_data.2020-07-14_11.31.11.txt
V18 - updated to survey_data.2020-07-02_11.01.36.txt
V17 - Change sample to dataset; change project to cohort
V16 - use namer on chunks
V15 - change correlation plots to small dots
V14 - combine plots
V13 - change gene count threshold from categporical to numeric
V12 - exclude datasets with fewer than 100 measured genes and use pre-generated survey_data
V11 - color consistency
V10 - calculate non genic and all exonic counts for correlations analysis
V9 - add gene count; import readdist.txt for more nuanced comparison of cor with gene count
reference range for unmapped reads is calculated based on unmapped reads/total
reference range for duplicate reads is calculated based on duplicate reads/mapped reads where mapped reads = total mapped and multi-mapped (<20x) reads)
reference range for non-exonic reads is calculated based on non-exonic/non duplicate reads) where non duplicate reads = exonic and non-exonic non dupe reads)
The composition fractions are not dependent on higher level ones
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(ggthemes) # for theme_linedraw()
library(ggpubr) # for stat_cor
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(cowplot) # for ggdraw and draw_label
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
##
## get_legend
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(RColorBrewer)
library(pander)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
inputs_dir <- "inputs"
data_file <- tibble(files = list.files(inputs_dir)) %>%
arrange(desc(files)) %>%
top_n(1, wt = files) %>%
pull(files)
print(data_file)
## [1] "survey_data.2020-07-21_13.04.32.txt"
min_genes_gt_0 <- 100
is_outlier <- function(x) {
(x > quantile(x, 0.75) + 1.5*IQR(x)) |
(x < quantile(x, 0.25) - 1.5*IQR(x))
}
most_common_val_w_percent <- function(value_vector = c("a", "a", "b")){
mc_vals <- tabyl(value_vector) %>%
arrange(desc(n)) %>%
top_n(1, n)
paste0(round(100*mc_vals$percent), "% ", mc_vals$value_vector)
}
# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
"StatBin2",
StatBin,
compute_group = function (data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, origin = NULL, right = NULL,
drop = NULL, width = NULL) {
if (!is.null(breaks)) {
if (!scales$x$is_discrete()) {
breaks <- scales$x$transform(breaks)
}
bins <- ggplot2:::bin_breaks(breaks, closed)
}
else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data$x)
}
bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth,
center = center, boundary = boundary,
closed = closed)
}
else {
bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins,
center = center, boundary = boundary,
closed = closed)
}
res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)
# drop 0-count bins completely before returning the dataframe
res <- res[res$count > 0, ]
res
})
col_spec <- cols(
.default = col_double(),
dataset_id = col_character(),
disease = col_character(),
pedaya = col_character(),
gender = col_character(),
cohort_name = col_character(),
cohort_code = col_character(),
doi_link = col_character(),
source_repository = col_character(),
repository_cohort_accession = col_character(),
repository_dataset_accession = col_character()
)
counts_and_meta <- read_tsv(file.path(inputs_dir, data_file), col_types = col_spec) %>%
group_by(cohort_code) %>%
mutate(n_in_cohort = n())
# brewer.pal(8, "Paired")
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C",
"#FF7F00", "#FF7F00")
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
nrow(counts_and_meta)
## [1] 2179
counts_and_meta %>% tabyl(disease)
## disease n percent
## acute lymphoblastic leukemia 680 0.3120697568
## acute megakaryoblastic leukemia 103 0.0472693896
## acute myeloid leukemia 221 0.1014226709
## acute undifferentiated leukemia 1 0.0004589261
## adrenocortical adenoma 1 0.0004589261
## adrenocortical cancer 2 0.0009178522
## adrenocortical carcinoma 20 0.0091785223
## alveolar rhabdomyosarcoma 40 0.0183570445
## cholangiocarcinoma 1 0.0004589261
## choroid plexus carcinoma 25 0.0114731528
## desmoplastic small round cell tumor 4 0.0018357045
## dysembryoplastic neuroepithelial tumor 13 0.0059660395
## embryonal rhabdomyosarcoma 42 0.0192748967
## embryonal tumor with multilayered rosettes 1 0.0004589261
## ependymoma 98 0.0449747591
## epithelioid sarcoma 1 0.0004589261
## Ewing sarcoma 70 0.0321248279
## fibrolamellar hepatocellular carcinoma 3 0.0013767783
## fibromatosis 1 0.0004589261
## gastrointestinal stromal tumor 4 0.0018357045
## germ cell tumor 1 0.0004589261
## glioblastoma multiforme 29 0.0133088573
## glioma 193 0.0885727398
## hepatoblastoma 1 0.0004589261
## hepatocellular carcinoma 3 0.0013767783
## hypereosinophillic syndrome 1 0.0004589261
## juvenile myelomonocytic leukemia 1 0.0004589261
## leukemia 1 0.0004589261
## lymphoma 49 0.0224873795
## medulloblastoma 201 0.0922441487
## melanoma 7 0.0032124828
## melanotic neuroectodermal tumor 1 0.0004589261
## meningioma 1 0.0004589261
## myofibromatosis 2 0.0009178522
## nasopharyngeal carcinoma 2 0.0009178522
## neuroblastoma 12 0.0055071134
## neuroendocrine carcinoma 1 0.0004589261
## not reported 2 0.0009178522
## not reported - QC fail by PI 1 0.0004589261
## osteosarcoma 157 0.0720513997
## ovarian serous cystadenocarcinoma 1 0.0004589261
## PEComa 1 0.0004589261
## pleuropulmonary blastoma 1 0.0004589261
## retinoblastoma 2 0.0009178522
## rhabdoid tumor 65 0.0298301973
## rhabdomyosarcoma 53 0.0243230840
## spindle cell/sclerosing rhabdomyosarcoma 3 0.0013767783
## supratentorial embryonal tumor NOS 18 0.0082606700
## synovial sarcoma 22 0.0100963745
## undifferentiated sarcoma NOS 3 0.0013767783
## undifferentiated spindle cell sarcoma 2 0.0009178522
## wilms tumor 11 0.0050481872
disease_freq_table <- counts_and_meta$disease %>%
str_to_sentence %>%
str_replace(" nos$", " NOS") %>%
as.factor %>%
fct_infreq %>%
fct_lump(prop=0.01) %>%
tabyl (var1 = "Disease") %>%
adorn_pct_formatting(digits = 1)
colnames(disease_freq_table)[1]="Disease"
write_tsv(disease_freq_table, "results/disease_freq_table.txt")
disease_freq_table %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Disease | n | percent |
|---|---|---|
| Acute lymphoblastic leukemia | 680 | 31.2% |
| Acute myeloid leukemia | 221 | 10.1% |
| Medulloblastoma | 201 | 9.2% |
| Glioma | 193 | 8.9% |
| Osteosarcoma | 157 | 7.2% |
| Acute megakaryoblastic leukemia | 103 | 4.7% |
| Ependymoma | 98 | 4.5% |
| Ewing sarcoma | 70 | 3.2% |
| Rhabdoid tumor | 65 | 3.0% |
| Rhabdomyosarcoma | 53 | 2.4% |
| Lymphoma | 49 | 2.2% |
| Embryonal rhabdomyosarcoma | 42 | 1.9% |
| Alveolar rhabdomyosarcoma | 40 | 1.8% |
| Glioblastoma multiforme | 29 | 1.3% |
| Choroid plexus carcinoma | 25 | 1.1% |
| Synovial sarcoma | 22 | 1.0% |
| Other | 131 | 6.0% |
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
##
## FALSE TRUE
## 1172 1007
library(ggthemes)
old_cohort_size_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(n_in_cohort, group=cohort_code),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) + xlab("Cohort size") +
ylab("Cohorts") +
theme(legend.position="none")
cohort_size_histogram <- ggplot(counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct ) +
geom_histogram(aes(n_in_cohort),
fill = "lightgrey", color = "black", stat = StatBin2,
breaks = seq(0,500, by=10)) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Cohort size") +
scale_y_continuous("Cohorts", breaks = seq(0, 8, 2)) +
theme(legend.position="none")
cohort_size_histogram
# for cohorts that are more than 75% one disease, color by that disease
n_distinct(counts_and_meta$cohort_code)
## [1] 48
sort(unique(counts_and_meta$n_in_cohort))
## [1] 3 4 6 7 8 10 15 17 19 22 23 24 25 26 31 35 39
## [18] 43 44 54 56 62 63 65 78 82 84 88 89 103 127 137 141 337
counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct %>%
arrange(desc(n_in_cohort)) %>%
kable
| cohort_code | n_in_cohort |
|---|---|
| Cohort_1 | 337 |
| Cohort_2 | 141 |
| Cohort_3 | 137 |
| Cohort_4 | 127 |
| Cohort_5 | 103 |
| Cohort_6 | 89 |
| Cohort_7 | 88 |
| Cohort_8 | 84 |
| Cohort_9 | 82 |
| Cohort_10 | 78 |
| Cohort_11 | 65 |
| Cohort_12 | 63 |
| Cohort_13 | 62 |
| Cohort_14 | 56 |
| Cohort_15 | 54 |
| Cohort_16 | 44 |
| Cohort_17 | 43 |
| Cohort_18 | 39 |
| Cohort_19 | 35 |
| Cohort_20 | 31 |
| Cohort_21 | 31 |
| Cohort_22 | 26 |
| Cohort_23 | 25 |
| Cohort_24 | 25 |
| Cohort_26 | 24 |
| Cohort_25 | 24 |
| Cohort_30 | 23 |
| Cohort_27 | 23 |
| Cohort_28 | 23 |
| Cohort_29 | 23 |
| Cohort_31 | 22 |
| Cohort_32 | 19 |
| Cohort_33 | 19 |
| Cohort_34 | 17 |
| Cohort_35 | 15 |
| Cohort_36 | 10 |
| Cohort_37 | 8 |
| Cohort_38 | 8 |
| Cohort_41 | 7 |
| Cohort_39 | 7 |
| Cohort_40 | 7 |
| Cohort_42 | 6 |
| Cohort_44 | 6 |
| Cohort_43 | 6 |
| Cohort_45 | 6 |
| Cohort_47 | 4 |
| Cohort_46 | 4 |
| Cohort_48 | 3 |
counts_and_meta %>%
select(cohort_code, n_in_cohort) %>%
distinct %>%
pull(n_in_cohort) %>%
median
## [1] 24.5
repo_info <- read_tsv("inputs/repos.tsv")
## Parsed with column specification:
## cols(
## repo_name = col_character(),
## repo_abbrev = col_character(),
## repo_url = col_character(),
## source_accession_pattern = col_character()
## )
repo_info %>%
rename(`Name`=repo_name, `Abbreviation`=repo_abbrev, URL = repo_url) %>%
select(-source_accession_pattern) %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| Name | Abbreviation | URL |
|---|---|---|
| Short Read Archive | SRA | https://www.ncbi.nlm.nih.gov/sra |
| European Genome-phenome Archive | EGA | https://www.ebi.ac.uk/ega/home |
| St. Jude Cloud | SJC | https://www.stjude.cloud/ |
| Kids First Data Portal | KFDP | https://portal.kidsfirstdrc.org/ |
| Database of Genotypes and Phenotypes | dbGaP | https://www.ncbi.nlm.nih.gov/gap/ |
counts_and_meta %>%
select(cohort_code, cohort_name, source_repository, repository_cohort_accession) %>% # removed DOI link
# group_by(across(c(-disease)))
distinct %>%
kable %>%
kable_styling(bootstrap_options = "striped", full_width = F)
| cohort_code | cohort_name | source_repository | repository_cohort_accession |
|---|---|---|---|
| Cohort_1 | TARGET-10 | dbGaP | phs000218 |
| Cohort_3 | TARGET-20 | dbGaP | phs000218 |
| Cohort_30 | TARGET-21 | dbGaP | phs000218 |
| Cohort_41 | TARGET-30 | dbGaP | phs000218 |
| Cohort_7 | TARGET-40 | dbGaP | phs000218.v15.p5 |
| Cohort_7 | TARGET-40 | dbGaP | phs000218 |
| Cohort_47 | TARGET-50 | dbGaP | phs000218 |
| Cohort_11 | TARGET-52 | dbGaP | phs000218 |
| Cohort_12 | EGAD00001001098 | EGA | EGAD00001001098 |
| Cohort_27 | EGAD00001000356 | EGA | EGAD00001000356 |
| Cohort_32 | EGAD00001002680 | EGA | EGAD00001002680 |
| Cohort_23 | EGAD00001001666 | EGA | EGAD00001001666 |
| Cohort_26 | phs000900.v1.p1 | dbGaP | phs000900.v1.p1 |
| Cohort_20 | 10.24370/SD_BHJXBDQK | KFDB | CBTTC-HGG-PA-01 |
| Cohort_20 | 10.24370/SD_BHJXBDQK | KFDB | CBTTC_PNET_PA_01 |
| Cohort_20 | 10.24370/SD_BHJXBDQK | KFDB | CBTTC-PolyA |
| Cohort_19 | phs000699.v1.p1 | dbGaP | phs000699.v1.p1 |
| Cohort_34 | EGAD00001000158 | EGA | EGAD00001000158 |
| Cohort_42 | EGAD00001000328 | EGA | EGAD00001000328 |
| Cohort_28 | EGAD00001000617 | EGA | EGAD00001000617 |
| Cohort_18 | EGAD00001001620 | EGA | EGAD00001001620 |
| Cohort_25 | EGAD00001000648 | EGA | EGAD00001000648 |
| Cohort_46 | SRP006575 | SRA | SRP006575 |
| Cohort_44 | SJC_Other | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_5 | SJC_AMLM | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_2 | SJC_BALL | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_16 | SJC_CBF | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_24 | SJC_CPC | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_37 | SJC_E | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_10 | SJC_EPD | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_21 | SJC_ERG | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_14 | SJC_ETV | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_9 | SJC_HGG | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_39 | SJC_HYPO | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_43 | SJC_INF | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_15 | SJC_LGG | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_38 | SJC_MB | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_40 | SJC_MEL | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_29 | SJC_OS | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_22 | SJC_PHALL | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_17 | SJC_RHB | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_45 | SJC_WLM | SJC | Pediatric Cancer Genome Project (PCGP) |
| Cohort_36 | EGAD00001000826 | EGA | EGAD00001000826 |
| Cohort_48 | SRP040454 | SRA | SRP040454 |
| Cohort_8 | phs000720.v2.p1 | dbGaP | phs000720.v2.p1 |
| Cohort_13 | phs000768.v2.p1 | dbGaP | phs000768.v2.p1 |
| Cohort_6 | phs000673.v2.p1 | dbGaP | phs000673.v2.p1 |
| Cohort_31 | EGAD00001001927 | EGA | EGAD00001001927 |
| Cohort_4 | EGAD00001003279 | EGA | EGAD00001003279 |
| Cohort_35 | SRP092501 | SRA | SRP092501 |
| Cohort_33 | SRP126664 | SRA | SRP126664 |
table(counts_and_meta$gender, useNA = "always")
##
## female male not reported unknown <NA>
## 739 1010 310 82 38
# samples with reported genders
counts_and_meta %>%
filter(! is.na(gender)) %>%
filter(! gender %in% c("not reported", "unknown")) %>%
tabyl(gender) %>%
adorn_totals
## gender n percent
## female 739 0.4225272
## male 1010 0.5774728
## Total 1749 1.0000000
table(counts_and_meta$pedaya, useNA = "always")
##
## No Yes, age < 30 years <NA>
## 51 2033 95
# at least one target sample is >30
n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | counts_and_meta$age_at_dx<30, na.rm = TRUE)
n_ped_aya
## [1] 2104
n_ped_aya/nrow(counts_and_meta)
## [1] 0.9655805
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7
median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.96
table(is.na(counts_and_meta$age_at_dx))
##
## FALSE TRUE
## 1739 440
seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
## 25 50 75 100 125 <NA>
## 4 238 366 1544 27 0
tabyl(seq_len_round_25)
## seq_len_round_25 n percent
## 25 4 0.001835704
## 50 238 0.109224415
## 75 366 0.167966957
## 100 1544 0.708581918
## 125 27 0.012391005
table(is.na(counts_and_meta$seq_length))
##
## FALSE
## 2179
table(counts_and_meta$seq_length)
##
## 36 50 51 75 76 81 100 101 110 125
## 4 11 227 279 40 47 98 1431 15 27
summary(counts_and_meta$seq_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36.00 76.00 101.00 91.51 101.00 125.00
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 25
old_read_length_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(seq_length, group=cohort_code),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Sequence length") +
ylab("Datasets")
read_length_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(seq_length),
fill = "lightgrey", color = "black", stat = StatBin2,
binwidth = 1) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Sequence length") +
ylab("Datasets") +
expand_limits(x=0)
read_length_histogram
subset(counts_and_meta, Mapped > Total_reads)
## # A tibble: 0 x 23
## # Groups: cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## # MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## # age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## # cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## # repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
subset(counts_and_meta, MND > Mapped)
## # A tibble: 0 x 23
## # Groups: cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## # MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## # age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## # cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## # repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
subset(counts_and_meta, MEND > MND)
## # A tibble: 0 x 23
## # Groups: cohort_code [0]
## # … with 23 variables: dataset_id <chr>, Total_reads <dbl>, Mapped <dbl>,
## # MND <dbl>, MEND <dbl>, seq_length <dbl>, disease <chr>,
## # age_at_dx <dbl>, pedaya <chr>, gender <chr>, cohort_code <chr>,
## # cohort_name <chr>, doi_link <chr>, source_repository <chr>,
## # repository_cohort_accession <chr>, repository_dataset_accession <chr>,
## # genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## # genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## # genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## # n_in_cohort <int>
min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>%
ungroup %>%
mutate(dataset_value = Total_reads/1e6) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(Total_read_counts %>%
select(-variance, -p25, -p75), digits = 1)
| min | max | median | iqr |
|---|---|---|---|
| 0.2 | 668 | 61.2 | 53.3 |
total_read_count_histogram <- ggplot(counts_and_meta) +
geom_histogram(aes(Total_reads/1E6),
fill = "lightgrey", color = "black", stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
xlab("Total reads") +
ylab("Datasets")
total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
read_counts_with_read_type_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
pct_dupe_of_mapped = (Mapped - MND) / Mapped,
pct_non_exonic_of_non_dupe = (MND - MEND) / MND
)
read_type_fractions_long <- read_counts_with_read_type_fractions %>%
pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value")
comparison_of_distributions <- read_type_fractions_long %>%
group_by(read_type_fraction_name) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(comparison_of_distributions %>%
mutate_if(is.double, ~100*.), digits = 3)
| read_type_fraction_name | variance | min | max | median | p25 | p75 | iqr |
|---|---|---|---|---|---|---|---|
| pct_dupe_of_mapped | 5.884 | 2.853 | 99.997 | 26.887 | 13.450 | 43.033 | 29.583 |
| pct_non_exonic_of_non_dupe | 4.094 | 4.495 | 97.000 | 24.769 | 16.189 | 37.277 | 21.088 |
| pct_unmapped_of_total | 0.605 | 0.710 | 76.677 | 3.414 | 2.740 | 6.006 | 3.266 |
kable(comparison_of_distributions %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0)
| read_type_fraction_name | min | max | median | iqr |
|---|---|---|---|---|
| pct_dupe_of_mapped | 3 | 100 | 27 | 30 |
| pct_non_exonic_of_non_dupe | 4 | 97 | 25 | 21 |
| pct_unmapped_of_total | 1 | 77 | 3 | 3 |
comparison_of_distributions
## # A tibble: 3 x 8
## read_type_fraction_na… variance min max median p25 p75 iqr
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 pct_dupe_of_mapped 0.0588 0.0285 1.00 0.269 0.135 0.430 0.296
## 2 pct_non_exonic_of_non… 0.0409 0.0450 0.970 0.248 0.162 0.373 0.211
## 3 pct_unmapped_of_total 0.00605 0.00710 0.767 0.0341 0.0274 0.0601 0.0327
# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
##
## FALSE TRUE
## 2102 77
# 426 datasets have more than 50% duplicates (Fig 2A).
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1753 0.8044975
## TRUE 426 0.1955025
fiftypct <- read_counts_with_read_type_fractions %>%
group_by(cohort_code) %>%
summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)
fiftypct %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 7 0.1458333
## TRUE 41 0.8541667
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
## has_a_dataset_with_more_than_50pct_dupes n percent
## FALSE 15 0.3125
## TRUE 33 0.6875
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 22 0.4583333
## TRUE 26 0.5416667
read_counts_with_read_type_fractions %>%
mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
tabyl(above_50)
## above_50 n percent
## FALSE 1849 0.8485544
## TRUE 330 0.1514456
fiftypcte <- read_counts_with_read_type_fractions %>%
group_by(cohort_code) %>%
summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)
fiftypcte %>% tabyl(median_lt_50)
## median_lt_50 n percent
## FALSE 3 0.0625
## TRUE 45 0.9375
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
## has_a_dataset_with_more_than_50pct_ne n percent
## FALSE 36 0.75
## TRUE 12 0.25
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
## median_lt_50_and_has_a_dataset n percent
## FALSE 39 0.8125
## TRUE 9 0.1875
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34
this_data <- read_counts_with_read_type_fractions %>%
select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort_code`
this_plot_title <- paste("Correlation of Total_reads and % Duplicates")
these_stats <- this_data %>%
ungroup %>%
summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))
fs1 <- ggplot(this_data,
aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black", se = FALSE) +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Total_reads/1E6),
y=max(this_data$pct_dupe_of_mapped),
hjust = 1, vjust = 1
) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
ggtitle(this_plot_title) +
theme(legend.position="none", plot.title=element_text(size=22)) +
xlab("Read counts (million)") +
ylab("% Duplicates")
ggsave("Figure_S1.png", fs1 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))
colors_for_read_types <- c(colors_for_read_types,
"Genome" = "darkblue",
"Exon" = "darkblue",
"Aligned reads" = "darkgrey")
plot1_colnames <- c("label", "x1", "x2", "y1", "y2")
read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6,
"MEND", 3, 4, 4, 5,
"MEND", 2.5, 3.5, 3, 4,
"Non-exonic", 4.5, 5.5, 3, 4,
"Unmapped", 6, 7, 3, 4,
"Exon", 2.5, 4, 1, 2,
"Aligned reads", 2.8, 2.8, 6, 7)
rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)
read_type_schematic_data <- matrix(read_type_schematic_data_as_vector,
ncol = length(plot1_colnames), byrow = TRUE,
dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
as_tibble %>%
type_convert() %>%
mutate(
base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
border_color = case_when(
label %in% c("Aligned reads", "Genome", "MEND", "Exon") ~ NA_character_,
TRUE ~ base_color),
fill_color = ifelse(label %in% c("MEND", "Exon"), base_color, "white"),
text_color = case_when(
label %in% c("MEND", "Exon")~ "white",
TRUE ~ base_color),
mid_bar_x = map2_dbl(x1, x2, function(x, y) mean(c(x,y))),
mid_bar_y = map2_dbl(y1, y2, function(x, y) mean(c(x,y)))
)
## Parsed with column specification:
## cols(
## label = col_character(),
## x1 = col_double(),
## x2 = col_double(),
## y1 = col_double(),
## y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data,
aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2,
fill = fill_color, color = border_color)) +
geom_rect() +
geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +
geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
color = text_color),
size = 4,
fontface = "bold") +
geom_rect(aes(xmin = min(x1 - padding_size),
ymin = 2.5,
xmax = end_of_box,
ymax=max(y2 + padding_size)),
fill = NA, color = "darkgrey", size = 0.25) +
scale_fill_identity() +
scale_color_identity() +
theme_void() +
theme(
panel.grid.major.x = element_line(color="lightgrey", size=0.2),
) +
scale_x_continuous(
breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))
read_type_schematic
stat_names <- c("pct_unmapped_of_total", "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_labels <- c("Unmapped", "Duplicate", "Non-exonic")
complement_stat_names <- c("Mapped reads", "Non-duplicate reads", "Exonic reads")
divisor_name = c("total", "mapped", "non-duplicate")
comparison_of_remaining <- read_type_fractions_long %>%
mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
group_by(read_type_fraction_name) %>%
summarize(min = round(1-max(dataset_value), 2),
max = round(1-min(dataset_value), 2),
median = round(1-median(dataset_value), 2),
p25 = round(1-quantile(dataset_value, 0.25), 2),
p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))
positive_bars <- comparison_of_remaining %>%
select(-min, -max, -p25, -p75) %>%
mutate(read_type_fraction_label = read_type_fraction_name,
position = "1") %>%
rename(abs_median = median)
negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
bar_label = read_type_fraction_label,
abs_median = 1-positive_bars$abs_median,
position = "2")
total_bar <- tibble(read_type_fraction_name = "Total reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = "Total reads\n(100% of reads)",
abs_median = 1,
position = "1"
)
MEND_stats_of_total <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(
pct_MEND_of_total = (Total_reads - MEND) / Total_reads
) %>%
ungroup %>%
summarize(min = round(1-max(pct_MEND_of_total), 2),
max = round(1-min(pct_MEND_of_total), 2),
median = round(1-median(pct_MEND_of_total), 2))
MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
read_type_fraction_label = read_type_fraction_name,
bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; x͂=",100* median, "%)")),
abs_median = 1,
position = "1"
)
bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = bar_names_in_order))
cat_space <- all_bars %>% filter(position == 1) %>%
arrange(read_type_fraction_name) %>%
select(read_type_fraction_name, abs_median) %>%
ungroup %>%
mutate(lagged1_abs_median = lag(abs_median, default =1),
lagged2_abs_median = lag(abs_median, n=2, default =1),
category_space = lagged1_abs_median*lagged2_abs_median
) %>%
select(read_type_fraction_name, category_space)
these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")
all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
abs_median[read_type_fraction_label=="Exonic reads"]*
category_space[read_type_fraction_label=="Exonic reads"],
abs_median*category_space),
read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
position = factor(position, levels = rev(sort(unique(position)))),
bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
fill_val = ifelse(position == 1, bar_color, NA),
border_color_val = bar_color,
text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
)
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=bar_label,
color = text_color,
size = font_size,
angle= text_angle),
position = position_stack(vjust = .5),
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
scale_size_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggplot(all_bars_rel, aes(y=read_type_fraction_name,
x=rel_median, group = position)) +
geom_col(aes(fill = fill_val,
color = border_color_val
)) +
geom_text(aes(label=read_type_fraction_label,
color = text_color,
size = font_size,
angle = text_angle),
position = position_stack(vjust = .5),
fontface = "bold") +
scale_fill_identity() +
scale_color_identity() +
scale_size_identity() +
theme_void() +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
read_type_fractions_schematic
figure_name <- "f1"
right_side <- plot_grid(read_type_schematic,
read_type_fractions_schematic,
nrow=2,
rel_heights = c(1, 2),
labels = c("C", "D"))
left_side <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B"))
f1 <- plot_grid(left_side,
right_side,
ncol = 2) +
draw_label(paste(figure_name, Sys.time()),
x = 0, y = 0.02, hjust = 0, size = 6)
f1
ggsave("Figure1.png", f1, height=6, width=10)
stat_names <- c("pct_unmapped_of_total", "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names
colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names
these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C",
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <- c("not assigned", "Total reads", "Non-exonic reads", "Exonic reads", "Duplicate reads", "Non-duplicate reads", "Unmapped reads", "Mapped reads")
# 6 x 11 was a good ratio, but the text was too small
read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
mutate(read_type_fraction_name = factor(read_type_fraction_name,
levels = stat_names))
read_type_fractions_histogram <- ggplot(read_type_fractions_long_for_histogram) +
geom_histogram(aes(x = dataset_value, color = read_type_fraction_name),
fill = "white", stat = StatBin2) +
scale_color_manual(values = colors_for_stat_names) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
scales = "free_y",
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Datasets") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
read_counts_with_MEND_fractions <- counts_and_meta %>%
arrange(desc(Total_reads)) %>%
mutate(pct_MEND_of_total = MEND /Total_reads)
ggplot(read_counts_with_MEND_fractions) +
geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
MEND_pct <- read_counts_with_MEND_fractions %>%
ungroup %>%
mutate(dataset_value = pct_MEND_of_total) %>%
summarize(variance= var(dataset_value),
min = min(dataset_value),
max = max(dataset_value),
median = median(dataset_value),
p25 = quantile(dataset_value, 0.25),
p75 = quantile(dataset_value, 0.75),
iqr = IQR(dataset_value))
kable(MEND_pct %>%
select(-variance, -p25, -p75) %>%
mutate_if(is.double, ~100*.), digits = 0)
| min | max | median | iqr |
|---|---|---|---|
| 0 | 79 | 50 | 31 |
head(sort(read_counts_with_MEND_fractions$pct_MEND_of_total))
## [1] 0.0000267009 0.0002533711 0.0003651723 0.0003767257 0.0003997294
## [6] 0.0004279584
sum(read_counts_with_MEND_fractions$pct_MEND_of_total<0.25)
## [1] 355
read_counts_with_MEND_fractions %>%
ungroup %>%
mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
tabyl(pct_under_25) %>%
adorn_totals()
## pct_under_25 n percent
## FALSE 1824 0.8370812
## TRUE 355 0.1629188
## Total 2179 1.0000000
read_counts_with_MEND_fractions %>%
ungroup %>%
filter(Mapped > 30E6) %>%
mutate(pct_under_25 = pct_MEND_of_total<0.25) %>%
tabyl(pct_under_25) %>%
adorn_totals()
## pct_under_25 n percent
## FALSE 1739 0.8368624
## TRUE 339 0.1631376
## Total 2078 1.0000000
figure_name <- "fracs_by_group"
read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
ungroup %>%
mutate(boxplot_label = fct_reorder(paste0(as.character(cohort_code), ", n=", n_in_cohort), n_in_cohort))
read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5) +
facet_wrap(~read_type_fraction_name,
nrow = 1,
labeller = labeller(
read_type_fraction_name = stat_label
)
) +
ylab("Cohorts") +
xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)), " datasets")) +
scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.x = element_text(size = 12, face = "bold")) +
theme(legend.position = "none")
read_type_fractions_per_cohort_boxplot
read_type_names <- c("Total_reads", "Mapped", "MND", "MEND")
this_cohort_name <- "EGAD00001003279"
this_cohort_code <- subset(read_counts_with_read_type_fractions, cohort_name == this_cohort_name)$cohort_code[1]
this_cohort_code
## [1] "Cohort_4"
read_counts_with_read_type_fractions %>%
filter(cohort_code == this_cohort_code) %>%
mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
tabyl(gt_98) %>%
add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
## gt_98 n percent
## FALSE 55 0.4330709
## TRUE 72 0.5669291
## Total 127 1.0000000
read_counts_with_read_type_fractions %>%
filter(cohort_code == this_cohort_code) %>%
filter(pct_dupe_of_mapped > 0.98) %>%
pull(Total_reads) %>% min
## [1] 178294341
counts_and_meta_long <- counts_and_meta %>%
pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value") %>%
ungroup %>%
mutate(read_type_name = factor(read_type_name, levels = read_type_names))
counts_and_meta_long_one_proj <- counts_and_meta_long %>%
filter(cohort_code == this_cohort_code)
table(counts_and_meta_long_one_proj$read_type_name)
##
## Total_reads Mapped MND MEND
## 127 127 127 127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads.
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)
summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include,
max_plots_to_include,
n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))
set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)
ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) +
geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
facet_wrap(~dataset_id, ncol = 1, strip.position="right") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
theme(strip.text.y = element_text(angle = 0)) +
scale_fill_brewer(palette = "Set1") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort_code)) +
coord_flip()
ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) +
geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
ggtitle(paste("Distribution of MEND counts in", this_cohort_code))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)
ggplot(counts_and_meta %>%
filter(cohort_code == this_cohort_code) %>%
mutate(pct_dupes = (Mapped - MND)/Mapped)) +
geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
theme_minimal(base_size = 20) +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
scale_fill_gradientn(colours = c(bl,"white", re)) +
ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort_code), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")
counts_and_meta_long_one_proj %>%
select(seq_length, dataset_id) %>%
distinct %>%
tabyl(seq_length)
## seq_length n percent
## 51 127 1
gene_counts_long <- counts_and_meta %>%
pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold") %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
ggplot(gene_counts_long) +
geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
facet_wrap(~Expression_threshold) +
theme_minimal() +
scale_fill_brewer(palette = "Set1") +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
# theme_linedraw(base_size = 20) +
theme(legend.position = "none") +
# scale_fill_brewer(palette = "Set1") +
ylab("Datasets") +
xlab("Measured genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
sort(counts_and_meta$genes_expressed_above_0) [1:100]
## [1] 9 9 12 12 13 14 14 15 16 16 17
## [12] 18 18 18 18 18 18 19 19 19 19 20
## [23] 20 20 20 20 20 21 21 21 21 21 21
## [34] 22 22 22 23 23 23 23 23 24 24 24
## [45] 24 24 24 24 25 25 25 26 26 26 26
## [56] 27 27 27 27 28 28 28 28 29 29 29
## [67] 29 29 30 30 31 31 32 33 35 36 37
## [78] 40 6098 7107 11830 15323 16793 16837 16943 17233 17531 18288
## [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247
library(corrr)
counts_and_meta %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all datasets", digits = 2)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
| Read_type_count | genes_expressed_above_0 | genes_expressed_above_1 | genes_expressed_above_2 | genes_expressed_above_3 | genes_expressed_above_4 | genes_expressed_above_5 |
|---|---|---|---|---|---|---|
| Total_reads | -0.20 | -0.40 | -0.40 | -0.40 | -0.39 | -0.39 |
| Mapped | -0.19 | -0.41 | -0.41 | -0.41 | -0.40 | -0.39 |
| MND | 0.51 | 0.24 | 0.20 | 0.18 | 0.17 | 0.16 |
| MEND | 0.48 | 0.27 | 0.26 | 0.26 | 0.26 | 0.26 |
counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
ungroup %>%
select_if(is.double) %>%
correlate() %>%
filter(rowname %in% read_type_names) %>%
rename(Read_type_count=rowname) %>%
mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
arrange(Read_type_count) %>%
select(c(Read_type_count, starts_with("genes_expressed"))) %>%
kable(caption = "Correlations with all but low gene count datasets", digits = 2)
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
| Read_type_count | genes_expressed_above_0 | genes_expressed_above_1 | genes_expressed_above_2 | genes_expressed_above_3 | genes_expressed_above_4 | genes_expressed_above_5 |
|---|---|---|---|---|---|---|
| Total_reads | 0.13 | -0.26 | -0.28 | -0.28 | -0.28 | -0.28 |
| Mapped | 0.21 | -0.25 | -0.26 | -0.27 | -0.27 | -0.27 |
| MND | 0.51 | 0.04 | 0.02 | 0.00 | 0.00 | 0.00 |
| MEND | 0.44 | 0.08 | 0.09 | 0.11 | 0.11 | 0.12 |
expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
mutate(Read_type = factor(Read_type, levels = read_type_names))
datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
filter(genes_expressed_above_0 > min_genes_gt_0) %>%
pull(dataset_id)
datasets_with_outlier_gene_counts <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(genes_expressed_above_0)) %>%
pull(dataset_id)
datasets_with_outlier_depths <- counts_and_meta %>%
ungroup %>%
filter(is_outlier(Total_reads)) %>%
pull(dataset_id)
plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){
this_data <- this_data %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5)) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
ggtitle(this_plot_title) +
theme(legend.position="none") +
xlab("Read counts (million)") +
ylab("Gene counts (thousand)")
}
fs2 <- plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
"Correlations calculated across all datasets")
fs2
## `geom_smooth()` using formula 'y ~ x'
ggsave("Figure_S2.png", fs2 + ggtitle(""))
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_gene_counts),
paste("Correlations calculated across all but outlier gene count datasets"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_depths),
paste("Correlations calculated across all but outlier read depth datasets"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>%
filter(! dataset_id %in% datasets_with_outlier_depths,
! dataset_id %in% datasets_with_outlier_gene_counts
),
paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `geom_smooth()` using formula 'y ~ x'
plot_gene_count_correlations(
expr_gene_and_read_counts_to_plot %>%
filter(disease == "acute lymphoblastic leukemia"),
"Correlations calculated among acute lymphoblastic leukemia datasets")
## `geom_smooth()` using formula 'y ~ x'
max_total_read_depth <- 250*1e6
print(min_genes_gt_0)
## [1] 100
sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
filter(Total_reads > max_total_read_depth) %>%
pull(dataset_id)
exclude_for_gene_count <- counts_and_meta %>%
filter(genes_expressed_above_0 < min_genes_gt_0) %>%
pull(dataset_id)
this_data <- expr_gene_and_read_counts_to_plot %>%
filter(
Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
! dataset_id %in% exclude_for_depth,
! dataset_id %in% exclude_for_gene_count
) %>%
mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))
n_distinct(this_data$dataset_id)
## [1] 1996
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")
these_stats <- this_data %>%
group_by(Read_type, Expression_threshold) %>%
summarize(r_corr = round(cor(Gene_count, Read_count), 2))
colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00",
"MND"="#E31A1C", "MEND"="#000000")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) +
geom_point(alpha = 0.5, shape = 20, size =0.1) +
geom_smooth(method = "lm", color = "black") +
geom_label(data = these_stats,
aes(label=paste0("r=", r_corr)),
x=max(this_data$Read_count/1E6),
y=max(this_data$Gene_count/1e3),
hjust = 1, vjust = 1
) +
theme_minimal() +
theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
axis.line.y = element_line(color="darkgrey", size = 0.5),
strip.text = element_text(face = "bold")) +
scale_color_manual(values = colors_for_correlation_plot) +
facet_grid(Read_type~Expression_threshold) +
theme(legend.position="none") +
scale_x_continuous("Read counts (million)", n.breaks = 4) +
ylab("Gene counts (thousand)")
cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'
# Figure 2
f2a <- read_type_fractions_histogram +
theme(strip.text.x = element_text(size = 10, face = "bold"),
axis.title.x=element_blank(),
axis.text.x=element_blank())
f2b <- read_type_fractions_per_cohort_boxplot +
theme(strip.text.x = element_blank(),
strip.background = element_blank(),
strip.text = element_blank())
f2ab <- plot_grid(f2a,
f2b,
ncol = 1,
rel_heights = c(1.5, 6),
align = "v",
labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f2c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f2 <- plot_grid(f2ab,
f2c,
nrow=1,
rel_widths = c(4,2.3))
figure_name <- "f2"
f2_labeled <- f2 + draw_label(paste(figure_name, Sys.time()),
x = 0, y = 0.02, hjust = 0, size = 6)
f2_labeled
ggsave("Figure2.png", f2_labeled, height=8, width=10)
ggplot(read_counts_with_read_type_fractions) +
geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))
ggplot(read_counts_with_read_type_fractions) +
geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))
cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads,
read_counts_with_read_type_fractions$pct_dupe_of_mapped,
method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34
pander(sessionInfo(), compact = FALSE)
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages:
other attached packages:
loaded via a namespace (and not attached):